Overview

This file loads produces Figures and runs analysis using the real data.

library(tidyverse)
library(VectraPolarisData)
library(ggridges)
library(MASS)
library(pscl)
library(patchwork)
library(viridis)

library(data.table)
library(Rtsne)
library(Rphenograph)

knitr::opts_chunk$set(echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width = 9,
  fig.height = 4,
  fig.path = './figs/'
)

theme_set(theme_bw() + theme(legend.position = "bottom"))

Install and load data

The data is publicly available on Bioconductor as the package VectraPolarisData. You can install it here:

if (!requireNamespace("BiocManager", quietly = TRUE)) {
     install.packages("BiocManager")
}

BiocManager::install("VectraPolarisData")

Ovarian data

Converting the ovarian cancer dataset from a SpatialExperiment object to a dataframe.

library(VectraPolarisData)
spe_ovarian <- HumanOvarianCancerVP()

## Assays slots
assays_slot <- assays(spe_ovarian)
intensities_df <- assays_slot$intensities
nucleus_intensities_df<- assays_slot$nucleus_intensities
rownames(nucleus_intensities_df) <- paste0("nucleus_", rownames(nucleus_intensities_df))
membrane_intensities_df<- assays_slot$membrane_intensities
rownames(membrane_intensities_df) <- paste0("membrane_", rownames(membrane_intensities_df))

# colData and spatialData
colData_df <- colData(spe_ovarian)
spatialCoords_df <- spatialCoords(spe_ovarian)

# clinical data
patient_level_ovarian <- metadata(spe_ovarian)$clinical_data %>%
  # create binary stage variable
  mutate(stage_bin = ifelse(stage %in% c("1", "2"), 0, 1))

cell_level_ovarian <- as.data.frame(cbind(colData_df, 
                                     spatialCoords_df,
                                     t(intensities_df),
                                     t(nucleus_intensities_df),
                                     t(membrane_intensities_df))
                               ) %>%
  dplyr::rename(cd19 = cd19_opal_480,
                cd68 = cd68_opal_520,
                cd3 = cd3_opal_540, 
                cd8 = cd8_opal_650,
                ier3 = ier3_opal_620,
                pstat3 = p_stat3_opal_570,
                ck = ck_opal_780,
                ki67 = ki67_opal_690)


# data frame with clinical characteristics where each row is a different cell
#ovarian_df <- full_join(patient_level_ovarian, cell_level_ovarian, by = "sample_id")

rm(spe_ovarian, assays_slot, intensities_df, nucleus_intensities_df, membrane_intensities_df, colData_df, spatialCoords_df)

Lung data

spe_lung <- HumanLungCancerV3()

## Assays slots
assays_slot <- assays(spe_lung)
intensities_df <- assays_slot$intensities
nucleus_intensities_df<- assays_slot$nucleus_intensities
rownames(nucleus_intensities_df) <- paste0("nucleus_", rownames(nucleus_intensities_df))
membrane_intensities_df<- assays_slot$membrane_intensities
rownames(membrane_intensities_df) <- paste0("membrane_", rownames(membrane_intensities_df))

# colData and spatialData
colData_df <- colData(spe_lung)
spatialCoords_df <- spatialCoords(spe_lung)

# clinical data
patient_level_lung <- metadata(spe_lung)$clinical_data 

cell_level_lung <- as_tibble(cbind(colData_df, 
                                     spatialCoords_df,
                                     t(intensities_df),
                                     t(nucleus_intensities_df),
                                     t(membrane_intensities_df))
                               )   %>%
  dplyr::rename(cd19 = cd19_opal_650,
                cd3 = cd3_opal_520, 
                cd14 = cd14_opal_540,
                cd8 = cd8_opal_620,
                hladr = hladr_opal_690,
                ck = ck_opal_570)



# data frame with clinical characteristics where each row is a different cell
#lung_df <- full_join(patient_level_lung, cell_level_lung, by = "slide_id")

rm(spe_lung, assays_slot, intensities_df, nucleus_intensities_df, membrane_intensities_df, colData_df, spatialCoords_df)

Introduction

Normalization

As an illustrative example will use the mxnorm package to normalize the lung data.

Setting up mx_dataset object

Use slide_id as slide identifier - Use sample_id as image identifier - Use the following marker columns: - cd19 - cd14 - cd3 - cd8 - hladr - ck - dapi - Use tissue_category as a metadata column

library(mxnorm)
mx_data = mx_dataset(data = cell_level_lung,
                     slide_id = "slide_id",
                     image_id = "sample_id",
                     marker_cols = c("cd19",
                                     "cd3",
                                     "cd14",
                                     "cd8",
                                     "hladr",
                                     "ck",
                                     "dapi"
                                     ),
                     metadata_cols = c("tissue_category", "phenotype_ck", "phenotype_cd8", "phenotype_cd14",
                                       "phenotype_other", "phenotype_cd19"))



summary(mx_data)
## Call:
## `mx_dataset` object with 153 slide(s), 7 marker column(s), and 6 metadata column(s)

And let’s look at the data object we’re going to use going forward:

knitr::kable(head(mx_data$data))  %>%
    kableExtra::kable_styling() %>% kableExtra::scroll_box(width = "100%")
slide_id sample_id cd19 cd3 cd14 cd8 hladr ck dapi tissue_category phenotype_ck phenotype_cd8 phenotype_cd14 phenotype_other phenotype_cd19
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 0.201 0.025 0.155 0.132 0.115 1.024 5.166 Tumor CK- CD8- CD14- Other- CD19-
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 0.546 0.172 1.799 0.821 0.239 0.721 7.477 Stroma CK- CD8- CD14- Other- CD19-
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 1.388 0.060 0.173 0.989 0.268 0.391 8.237 Stroma CK- CD8- CD14- Other- CD19-
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 0.683 0.004 1.136 0.174 0.245 0.468 3.786 Stroma CK- CD8- CD14- Other- CD19-
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 0.176 0.034 0.176 0.191 0.127 1.608 6.055 Tumor CK+ CD8- CD14- Other- CD19-
#01 0-889-121 #01 0-889-121 P44_[40864,18015].im3 0.179 0.052 0.227 0.227 0.136 1.147 12.131 Tumor CK+ CD8- CD14- Other- CD19-

Normalize using mx_normalize()

Normalize using two methods: log10, and mean_divide. Then, calculate Otsu discordance metrics.

mx_norm = mx_normalize(mx_data,
                       transform = "log10_mean_divide",
                       method = "None")


# Otsu
mx_norm = run_otsu_discordance(mx_norm,
                            table="both",
                            thresold_override=NULL,
                            plot_out = FALSE)

# umap
mx_norm = run_reduce_umap(mx_norm,
                          table="both",
                          marker_list = mx_data$marker_cols,
                          downsample_pct = .1,
                          metadata_cols = mx_data$metadata_cols)

Summary table that calculates the metrics from the Bioinformatics paper is printed below

# calculate metrics for unnormalized and normalized data
summary(mx_norm)
## Call:
## `mx_dataset` object with 153 slide(s), 7 marker column(s), and 6 metadata column(s)
## 
## Normalization:
## Data normalized with transformation=`log10_mean_divide` and method=`None`
## 
## Anderson-Darling tests:
##       table mean_test_statistic mean_std_test_statistic mean_p_value
##  normalized             478.249                  34.932            0
##         raw            1064.311                  97.682            0
## 
## Threshold discordance scores:
##       table mean_discordance sd_discordance
##  normalized            0.039          0.048
##         raw            0.098          0.103
## 
## Clustering consistency (UMAP):
##       table adj_rand_index cohens_kappa
##  normalized          0.007        0.000
##         raw          0.010       -0.001

Plotting

First, density plots. Only showing density plots for unnormalized cells for two types of immune markers and 3 subjects.

set.seed(103001)
ids = sample(unique(cell_level_lung$slide_id), 3)
markers = c("cd19", "cd14")
mx_df = mx_data$data %>%
  pivot_longer(cd19:dapi, names_to = "marker", values_to = "marker_value") %>%
  filter(slide_id %in% ids)
  
  
cd14 = mx_df %>% 
    filter(marker %in% c("cd14")) %>%
    ggplot() +
  geom_line(stat = "density", aes(marker_value, group = sample_id, color = slide_id), 
            alpha=0.5, linetype = 2) +
    geom_density(aes(marker_value, group = slide_id,
                   color = slide_id), size = 1.25) +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~marker, scales = "free", ncol = 1) +
  theme(legend.position = "none") +
  xlim(0, 1) +
  labs(x = "")


cd19 = mx_df %>%
  filter(marker %in% c("cd19")) %>%
  ggplot() +
  geom_line(stat = "density", aes(marker_value, group = sample_id, color = slide_id), 
            alpha=0.5, linetype = 2) +
    geom_density(aes(marker_value, group = slide_id,
                   color = slide_id), size = 1.25) +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~marker, scales = "free", ncol = 1) +
  theme(legend.position = "none") +
  xlim(0, 0.5) +
  labs(x = "marker value")

Discordance plots. Show for subset of markers and maybe subset of subjects.

set.seed(103001)
ids = sample(unique(cell_level_lung$slide_id), 15)

otsu_data = mx_norm$otsu_data %>%
  filter(marker %in% c("cd14", "cd19", "cd8", "cd3", "dapi", "ck"),
         slide_id %in% ids) %>%
  mutate(slide_id = as.numeric(factor(slide_id)),
         norm = factor(table, levels = c("raw", "normalized"))) 

point_size = 2
    

mean_vals = otsu_data %>% 
  group_by(norm, slide_id) %>% 
  summarize(m1 = mean(discordance_score), .groups = "drop")
        

disc = otsu_data %>%
  ggplot() + 
  geom_point(aes(discordance_score, factor(slide_id), color = marker), size = point_size) +
  facet_wrap(~ norm) +
  geom_point(data = mean_vals, aes(m1, slide_id), color = "black", fill = "white", 
             shape = 23, size = point_size) +
  labs(x = "discordance score", y = "slide id") +
  theme(legend.position = c(.9, .7))

Putting the plots together

(cd14 / cd19) / disc + plot_layout(heights = c(1, 1, 2.5))

rm(mx_data, mean_vals, otsu_data, discordance_score, ids, markers, point_size, mx_df)

Phenotyping

Add in phenograph plots. Clustering using marker values for a single subject. First using unnormalized marker values.

unnorm_df = mx_norm$data %>%
  filter(slide_id == "#01 0-889-121") %>%
  dplyr::select(cd19, cd3, cd14, cd8, ck, contains("phenotype")) %>%
  mutate(phenotype = case_when(
    phenotype_cd14 == "CD14+" ~ "CD14+",
    phenotype_cd19 == "CD19+" ~ "CD19+",
    phenotype_cd8 == "CD8+" ~ "CD8+",
    phenotype_ck == "CK+" ~ "CK+",
    TRUE ~ "other"
  ))
  

unnorm_df = na.omit(unnorm_df)
unnorm_df_mat = as.matrix(unnorm_df[, 1:5])


# tsne 
tsne_results = Rtsne(unnorm_df_mat, dims = 2,check_duplicates = FALSE)
unnorm_df$tsne1 = tsne_results$Y[,1]  
unnorm_df$tsne2 = tsne_results$Y[,2]


# phenograph
phen_results = Rphenograph(unnorm_df_mat, k = 100)
##   Finding nearest neighbors...DONE ~ 0.372 s
##   Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 46.734 s
##   Build undirected graph from the weighted links...DONE ~ 2.785 s
##   Run louvain clustering on the graph ...DONE ~ 6.541 s
##   Return a community class
##   -Modularity value: 0.851991 
##   -Number of clusters: 16
unnorm_df$cluster <- factor(membership(phen_results[[2]]))

Now using normalized marker values

norm_df = mx_norm$norm_data %>%
  filter(slide_id == "#01 0-889-121") %>%
  dplyr::select(cd19, cd3, cd14, cd8, ck, contains("phenotype"))

norm_df = na.omit(norm_df)
norm_df_mat = as.matrix(norm_df[, 1:5])


# tsne 
#tsne_results = Rtsne(norm_df_mat, dims = 2,check_duplicates = FALSE)
norm_df$tsne1 = tsne_results$Y[,1]  
norm_df$tsne2 = tsne_results$Y[,2]


# phenograph
phen_results = Rphenograph(norm_df_mat, k = 100)
##   Finding nearest neighbors...DONE ~ 0.558 s
##   Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 45.479 s
##   Build undirected graph from the weighted links...DONE ~ 2.733 s
##   Run louvain clustering on the graph ...DONE ~ 5.872 s
##   Return a community class
##   -Modularity value: 0.7828095 
##   -Number of clusters: 14
norm_df$cluster <- factor(membership(phen_results[[2]]))

Plotted results for unnormalized phenograph labels, normalized phenograph labels, and Inform labels. Results are plotted for single subject with 5` ROIs.

p1 = unnorm_df %>%
  ggplot(aes(tsne1, tsne2, color = phenotype)) +
  geom_point(alpha = 0.5, size = 0.75) +
  labs(x = "TSNE 1", y = "TSNE 2", title = "Inform") +
  theme(legend.title=element_blank(), legend.position = "right") 

p2 = unnorm_df %>%
  ggplot(aes(tsne1, tsne2, color = cluster)) +
  geom_point(alpha = 0.5, size = 0.75) +
  labs(x = "TSNE 1", y = "", title = "Phenograph, unnormalized") +
  guides(fill=guide_legend(title="New Legend Title")) +
  theme(legend.position = "right")

p3 = norm_df %>%
  ggplot(aes(tsne1, tsne2, color = cluster)) +
  geom_point(alpha = 0.5, size = 0.75) +
  labs(x = "TSNE 1", y = "", title = "Phenograph, normalized") +
  theme(legend.position = "right")

p1 / p2 / p3

Compositional analysis

Data is currently in cell-level format (each row is a cell). To make it easier for modeling counts and proportions, going to summarize the dataset so that each row is a subject and cell types are reported as counts.

Look at only tumor areas.

# aggregate to cell counts for each subject 
ovarian_counts = cell_level_ovarian %>%
  group_by(sample_id, tissue_category) %>%
  summarize(total_cells = n(),
            cd68_positive_cells = length(phenotype_cd68[phenotype_cd68 == "CD68+"]),
            ki67_positive_cells = length(phenotype_ki67[phenotype_ki67 == "Ki67+"]),
            ck_positive_cells = length(phenotype_ck[phenotype_ck == "CK+"]),
            cd19_positive_cells = length(phenotype_cd19[phenotype_cd19 == "CD19+"]),
            pstat3_positive_cells = 
              length(phenotype_p_stat3[phenotype_p_stat3 == "pStat3+"]),
            cd3_positive_cells = length(phenotype_cd3[phenotype_cd3 == "CD3+"]),
            cd8_positive_cells = length(phenotype_cd8[phenotype_cd8 == "CD8+"]),
            cd3plus_cd8plus_cells = length(phenotype_cd8[phenotype_cd8 == "CD8+" &
                                                           phenotype_cd3 == "CD3+"])
            ) %>%
  ungroup()  %>%
  dplyr::select(sample_id, tissue_category, total_cells, cd68_positive_cells, cd19_positive_cells, cd3_positive_cells,
         cd8_positive_cells) %>%
  filter(tissue_category != "Glass") %>%
  pivot_longer(cd68_positive_cells:cd8_positive_cells, names_to = "cell_type", values_to = "count") %>%
  mutate(proportion = count / total_cells,
         cell_type = factor(cell_type, levels = c("cd3_positive_cells", "cd8_positive_cells",
                                                    "cd19_positive_cells", "cd68_positive_cells"),
                              labels = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages")))



ovarian_counts = full_join(dplyr::select(patient_level_ovarian, sample_id, stage_bin, BRCA_mutation, 
                                         primary, age_at_diagnosis),
                           ovarian_counts, by = "sample_id") %>%
    filter(proportion < 0.75, 
         tissue_category == "Tumor") %>%
    mutate(stage_bin = factor(stage_bin, levels = 0:1, labels = c("Stage I/II", "Stage III/IV")),
           BRCA_mutation = factor(BRCA_mutation, levels = 0:1, labels = c("BRCA-", "BRCA+"))) 

Histograms of cell proportions

# by brca mutation
ovarian_counts %>%
  filter(!is.na(BRCA_mutation)) %>%
  ggplot(aes(proportion,cell_type, fill = cell_type, height = stat(density))) + 
  geom_density_ridges(stat = "binline", bins = 15, scale = .95, draw_baseline = FALSE) +
  scale_y_discrete(expand = expansion(add = c(0.05, 0.4))) +
  facet_wrap(~ BRCA_mutation) +
  geom_vline(xintercept = 0, linetype = 2) +
  theme(legend.position = "none") +
  labs(x = "proportion of cells in tumor area", y = "cell type")

Proportions

Calculate raw proportions across cell types:

summary_stats = ovarian_counts %>%
  group_by(cell_type) %>%
  summarize(model = "naive",
            proportion = mean(proportion),
            lower = proportion - 1.96 * sqrt( (proportion * (1-proportion))/132),
            upper = proportion + 1.96 * sqrt( (proportion * (1-proportion))/132)
            ) %>%
  ungroup() %>%
  dplyr::select(cell_type, model, proportion, lower, upper)

Model the proportions as a function of BRCA mutation.

# define function to extract proportions and standard deviations for each cell type
get_proportions = function(model, model_name){
  if(model_name %in% c("ZIP","ZINB") ){
    ci = broom:::broom_confint_terms(model) %>%
      filter(grepl("BRCA", term)) %>%
      mutate(lower = exp(conf.low), upper = exp(conf.high),
             term = str_remove(term, "count_"))
    
    df = left_join(as_tibble(summary(model)$coefficients$count, rownames = "term"),
                  ci) %>%
      janitor::clean_names() %>%
      filter(grepl("BRCA", term)) %>%
      mutate(BRCA = exp(estimate), p = pr_z,
             cell_type = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages"),
           model = model_name) %>%
      dplyr::select(cell_type, model, BRCA, lower, upper, p)
      return(df)
  }
  broom::tidy(model, exp = TRUE, conf.int = TRUE) %>% 
    filter(grepl("BRCA", term)) %>%
    mutate(cell_type = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages"),
           model = model_name,
           p = p.value) %>%
    dplyr::select(cell_type, model, BRCA = estimate, lower = conf.low, upper = conf.high, p)
}

# naive model (square root transform)


# poisson regression
mod_pois = glm(count ~ cell_type * BRCA_mutation  + offset(log(total_cells)), 
           family = "poisson",
           data = ovarian_counts)

prop_pois = get_proportions(mod_pois, "Poisson")

# logistic regression
mod_bin = glm(cbind(count, total_cells - count) ~ cell_type * BRCA_mutation, 
           family = binomial,
           data = ovarian_counts)

prop_bin = get_proportions(mod_bin, "binomial")

# negative binomial regression
mod_negbin = glm.nb(count ~ cell_type * BRCA_mutation + offset(log(total_cells)), 
                    data = ovarian_counts)

prop_negbin = get_proportions(mod_negbin, "negbinomial")

# zero inflated poisson?
mod_zeroinfl <- zeroinfl(count ~ cell_type * BRCA_mutation + offset(log(total_cells)) | 1,
               dist = "poisson",
               data = ovarian_counts)


prop_zeroPois = get_proportions(mod_zeroinfl, "ZIP")

#model = mod_zeroinfl
#model_name = "ZIP"

# zero inflated negbin
mod_zeroNegBin <- zeroinfl(count ~ cell_type * BRCA_mutation  + offset(log(total_cells)) | 1,
               dist = "negbin",
               data = ovarian_counts)

prop_zeroNegBin = get_proportions(mod_zeroNegBin, "ZINB")





## combine results
# make some other more informative and prettier plots. These p-values are super significant.
bind_rows(naive, prop_pois, prop_bin, prop_negbin,prop_zeroPois, prop_zeroNegBin) %>%
  #filter(model != "naive") %>%
  mutate(model = factor(model, levels = c("naive", "binomial", "Poisson", "negbinomial", "ZIP", "ZINB"))) %>%
  ggplot(aes(model, BRCA, color = model, shape = model)) +
  geom_point(position = position_dodge(width = 0.3), size = .8) +
  geom_errorbar(aes(ymin = lower, ymax = upper), position = position_dodge(width = 0.3), width = .1) +
  #ylim(-.1, 0.075) +
  theme_minimal() +
  facet_wrap(~cell_type) +
  labs(x = "")

Spatial analysis

Use ovarian data, calculate K function. Show picture with high clustering, low clustering, poor quality. Use radius from Ben’s paper. Look at bivariate K as well and show association with survival.